library(xgboost)
library(caret)
# ---- 10-fold cross-validated XGBoost regression ----
# Read the raw table and keep the response (EBaPeq, first column) followed by
# the 12 predictor columns fed to the model.
data <- read.csv("/Users/MyDocs/work/ETEQ.csv")
out_data <- data[, c(
  "EBaPeq", "Coal", "Coking", "Crudeoil", "Gasoline", "Diesel", "FuelOil",
  "Iron", "Crudesteel", "Al", "Straw", "Firewood", "Population"
)]

# Address columns by name rather than by position (previously 1 and 2:13) so
# the split into response / predictors follows the selection above.
response_col <- "EBaPeq"
predictor_cols <- setdiff(names(out_data), response_col)

# Seed the RNG so the fold assignment is reproducible. With
# returnTrain = TRUE each element of `folds` holds the TRAINING row indices.
set.seed(42)
folds <- createFolds(out_data$EBaPeq, k = 10, list = TRUE, returnTrain = TRUE)

# The booster configuration is the same for every fold, so build it once
# instead of on every iteration.
params <- list(
  booster = "gbtree",
  eta = 0.05,
  max_depth = 5,
  min_child_weight = 1,
  subsample = 0.8,
  colsample_bytree = 0.8,
  objective = "reg:squarederror",
  eval_metric = "rmse",
  nthread = 4
)
num_rounds <- 5000

# Preallocate the per-fold metrics rather than growing vectors in the loop.
rmse_values <- numeric(length(folds))
r2_values <- numeric(length(folds))

# Iterate over the folds actually returned instead of a hard-coded 1:10.
for (i in seq_along(folds)) {
  train_indices <- folds[[i]]
  train <- out_data[train_indices, ]
  test <- out_data[-train_indices, ]  # held-out rows for this fold

  train_x <- train[, predictor_cols]
  train_y <- train[[response_col]]
  test_x <- test[, predictor_cols]
  test_y <- test[[response_col]]

  dtrain <- xgb.DMatrix(as.matrix(train_x), label = train_y)
  dtest <- xgb.DMatrix(as.matrix(test_x))

  model <- xgb.train(params, dtrain, num_rounds)
  y_pred <- predict(model, dtest)

  # Out-of-fold error and coefficient of determination.
  rmse <- sqrt(mean((y_pred - test_y)^2))
  R_2 <- 1 - sum((test_y - y_pred)^2) / sum((test_y - mean(test_y))^2)

  rmse_values[i] <- rmse
  r2_values[i] <- R_2
  print(paste0("Fold ", i, " - RMSE: ", rmse, ", R^2: ", R_2))
}

# Summarise the cross-validation by averaging the per-fold metrics.
avg_rmse <- mean(rmse_values)
avg_r2 <- mean(r2_values)
print(paste0("Average RMSE: ", avg_rmse))
print(paste0("Average R^2: ", avg_r2))

# ---- Single 70/30 hold-out evaluation ----
# Re-select the response (EBaPeq, first column) and the 12 predictor columns
# from the table loaded earlier in the script.
out_data <- data[, c(
  "EBaPeq", "Coal", "Coking", "Crudeoil", "Gasoline", "Diesel", "FuelOil",
  "Iron", "Crudesteel", "Al", "Straw", "Firewood", "Population"
)]

# Draw 70% of the rows for training. floor() makes explicit the truncation
# that sample() previously applied implicitly to the non-integer size.
n_rows <- nrow(out_data)
train_index <- sample(n_rows, floor(0.7 * n_rows))
train <- out_data[train_index, ]
# Evaluate only on rows the model has NOT seen. The previous code used the
# whole data set as `test`, so the reported RMSE / R_2 included the training
# rows and were optimistically biased.
test <- out_data[-train_index, ]

train_x <- train[, 2:13]
train_y <- train[, 1]
test_x <- test[, 2:13]
test_y <- test[, 1]

dtrain <- xgb.DMatrix(as.matrix(train_x), label = train_y)
dtest <- xgb.DMatrix(as.matrix(test_x))

params <- list(
  booster = "gbtree",
  eta = 0.05,
  max_depth = 5,
  min_child_weight = 1,
  subsample = 0.8,
  colsample_bytree = 0.8,
  objective = "reg:squarederror",
  # "auc" is a ranking/classification metric and does not fit a squared-error
  # regression objective; use RMSE, matching the cross-validation section.
  eval_metric = "rmse",
  nthread = 4
)
num_rounds <- 5000

model <- xgb.train(params, dtrain, num_rounds)
y_pred <- predict(model, dtest)

# Hold-out error and coefficient of determination.
rmse <- sqrt(mean((y_pred - test_y)^2))
print(paste0("RMSE: ", rmse))
R_2 <- 1 - sum((test_y - y_pred)^2) / sum((test_y - mean(test_y))^2)
print(paste0("R_2: ", R_2))